#!/usr/bin/env perl
use warnings;
use strict;

#
# Builds a version of the probe set answers file in which each rating
# has a category. A rating is in category 1 if its movie was matched with
# a Wikipedia article, and it is in category 2 otherwise.
#
# Takes one argument: a file like big.movies.ids (see Makefile), with one
# netflix movie id on each line. No assumption is made about the order of
# the ids.
#
# Takes a probe_answers file on stdin (like the one in netflix/data).
# Produces the categorized file on stdout.
#

# Exactly one argument is required: the file listing the matched movie ids.
scalar(@ARGV) == 1 or die "need list of matched movie ids as argument";
my $matched_id_file = $ARGV[0];
-e $matched_id_file or die "input id file $matched_id_file not found";

# Three-argument open keeps the file name from being interpreted as part of
# the open mode, and the result is checked so that an unreadable file is
# reported instead of silently producing an empty set of matched ids.
open(my $fh, '<', $matched_id_file)
  or die "cannot open input id file $matched_id_file: $!";

# Build a set (hash keyed by integer movie id) for constant-time lookups.
my %matched_ids;
while (my $id_line = <$fh>) {
  # Skip blank lines (e.g. a trailing empty line) rather than registering
  # them as a bogus movie id 0.
  next unless $id_line =~ /\S/;
  $matched_ids{int($id_line)} = 1;
}
close($fh);

# Read probe input and write output as we go.
#
# The input consists of movie header lines ("<movie id>:") each followed by
# rating lines for that movie. Header lines are echoed unchanged; on rating
# lines, everything after the first two comma-separated numeric fields is
# replaced by the category of the current movie.
my $category;
while (my $line = <STDIN>) {
  chomp $line;
  if ($line =~ /(\d+):/) {
    # Movie header: category 1 if the movie was matched, 2 otherwise.
    $category = exists $matched_ids{int($1)} ? 1 : 2;
    print $line;
  } else {
    # A rating line is only meaningful once a movie header has been seen.
    defined($category) or die "expected probe answers to start with movie id";
    $line =~ /(\d+,\d+,).*/ or die "unexpected rating line format ($line)";
    print $1, $category;
  }
  print "\n";
}

# Copyright (c) 2009 John Lees-Miller
# 
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
# 
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.

